This lab will introduce you to machine learning by predicting the presence of a species of your choosing from observations and environmental data. We will largely follow guidance found in the Species Distribution Modeling chapter of R Spatial (rspatial.org), using slightly newer R packages and functions.
This first part of the lab involves fetching data for your species of interest, whether terrestrial or marine.
# load packages, installing if missing
# Bootstrap librarian: install from CRAN if absent, then attach.
# requireNamespace(quietly = TRUE) checks availability without attaching
# (and without require()'s console noise); library() then errors loudly
# if installation failed, instead of silently returning FALSE.
if (!requireNamespace("librarian", quietly = TRUE)) {
  install.packages("librarian")
}
library(librarian)
## Loading required package: librarian
# Load (installing if needed) all packages used in this lab.
# cran_repo is set explicitly to silence shelf()'s default-mirror message
# (the warning shown below was produced before this argument was added).
librarian::shelf(
  dismo, dplyr, DT, ggplot2, here, htmltools, leaflet, mapview, purrr,
  raster, readr, rgbif, rgdal, rJava, sdmpredictors, sf, spocc, tidyr,
  cran_repo = "https://cran.r-project.org")
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
## Warning: multiple methods tables found for 'crop'
## Warning: multiple methods tables found for 'extend'
# dplyr::select is masked by raster::select; make the dplyr version the default
select <- dplyr::select
options(readr.show_col_types = FALSE)

# set random seed for reproducibility
set.seed(42)

# directory to store data
# (TRUE/FALSE spelled out: T/F are ordinary variables and can be reassigned)
dir_data <- here("data/sdm")
dir.create(dir_data, showWarnings = FALSE, recursive = TRUE)
Ursus arctos horribilis - Grizzly Bear
obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
redo    <- TRUE  # set to FALSE to reuse cached obs.geojson instead of re-querying GBIF

# `||` (scalar, short-circuiting) is the correct operator for an if() condition;
# `|` is for elementwise vector logic
if (!file.exists(obs_geo) || redo){
  # get species occurrence data from GBIF with coordinates
  (res <- spocc::occ(
    query = 'Ursus arctos horribilis',
    from = 'gbif', has_coords = TRUE,
    limit = 10000))

  # extract data frame from result
  df <- res$gbif$data[[1]]
  readr::write_csv(df, obs_csv)

  # convert to points of observation from lon/lat columns in data frame
  obs <- df %>%
    sf::st_as_sf(
      coords = c("longitude", "latitude"),
      crs = st_crs(4326)) %>%
    select(prov, key) # save space (joinable from obs_csv)
  sf::write_sf(obs, obs_geo, delete_dsn = TRUE)
}
obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 1496
# show points on map
mapview::mapview(obs, map.types = "Esri.WorldTopoMap")
Question: How many observations total are in GBIF for your species? (Hint: ?occ) 1,634
Question: Do you see any odd observations, like marine species on land or vice versa? If so, please see the Data Cleaning and explain what you did to fix or remove these points.
No obviously erroneous observations stand out on the initial map. A few sightings occur at lower latitudes, but these fall within the species' historic range, so the bears could plausibly have followed the Rocky Mountains south.
I found one paper that gave me some ideas about predictors. It is cited here: Mowat G, Heard DC, Schwarz CJ (2013) Predicting Grizzly Bear Density in Western North America. PLoS ONE 8(12): e82757. https://doi.org/10.1371/journal.pone.0082757
Next, you’ll use the Species Distribution Model predictors R package sdmpredictors to get underlying environmental data for your observations. First you’ll get underlying environmental data for predicting the niche on the species observations. Then you’ll generate pseudo-absence points with which to sample the environment. The model will differentiate the environment of the presence points from the pseudo-absence points.
dir_env <- file.path(dir_data, "env")

# set a default data directory for sdmpredictors downloads
options(sdmpredictors_datadir = dir_env)

# choosing terrestrial only: the grizzly bear is a land species
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)

# show table of datasets
env_datasets %>%
  select(dataset_code, description, citation) %>%
  DT::datatable()

# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")

# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)

# choose layers after some inspection and perhaps consulting literature
# (elevation, temperature means/variability, terrain ruggedness, topographic wetness)
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio2", "WC_bio4", "WC_tmean1", "ER_tri", "ER_topoWet")

# get layers (namespace-qualified for consistency with the sdmpredictors calls above)
env_stack <- sdmpredictors::load_layers(env_layers_vec)

# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = TRUE) # makes the html too big for Github
plot(env_stack, nc = 2)
Crop the environmental rasters to a reasonable study area around our species observations.
obs_hull_geo  <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")

# `||` for the scalar if() condition (not the vectorized `|`)
if (!file.exists(obs_hull_geo) || redo){
  # make convex hull around points of observation to define the study area
  obs_hull <- sf::st_convex_hull(st_union(obs))

  # save obs hull
  write_sf(obs_hull, obs_hull_geo)
}
## Warning in CPL_write_ogr(obj, dsn, layer, driver,
## as.character(dataset_options), : GDAL Error 6: DeleteLayer() not supported by
## this dataset.
obs_hull <- read_sf(obs_hull_geo)
# show points on map
mapview(
list(obs, obs_hull))
## Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
## multiple of vector length (arg 1)
if (!file.exists(env_stack_grd) || redo){
  obs_hull_sp <- sf::as_Spatial(obs_hull)

  # mask cells outside the hull, then crop the raster extent to its bounding box
  env_stack <- raster::mask(env_stack, obs_hull_sp) %>%
    raster::crop(extent(obs_hull_sp))
  writeRaster(env_stack, env_stack_grd, overwrite = TRUE)
}
env_stack <- stack(env_stack_grd)
# show map
# mapview(obs) +
# mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)
absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo     <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

if (!file.exists(absence_geo) || redo){
  # get raster count of observations per cell
  r_obs <- rasterize(
    sf::as_Spatial(obs), env_stack[[1]], field = 1, fun = 'count')

  # show map
  # mapview(obs) +
  #   mapview(r_obs)

  # create mask of cells that have environmental data but NO observations,
  # so pseudo-absence points are never drawn from presence cells
  r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse = TRUE)

  # generate random pseudo-absence points inside mask (same n as presences)
  absence <- dismo::randomPoints(r_mask, nrow(obs)) %>%
    as_tibble() %>%
    st_as_sf(coords = c("x", "y"), crs = 4326)

  write_sf(absence, absence_geo, delete_dsn = TRUE)
}
absence <- read_sf(absence_geo)
# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") +
mapview(absence, col.regions = "gray")
## Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
## multiple of vector length (arg 1)
if (!file.exists(pts_env_csv) || redo){
  # combine presence (1) and absence (0) into a single set of labeled points
  pts <- rbind(
    obs %>%
      mutate(
        present = 1) %>%
      select(present, key),
    absence %>%
      mutate(
        present = 0,
        key     = NA)) %>%
    mutate(
      ID = 1:n()) %>%  # row ID used to join raster values back on below
    relocate(ID)
  write_sf(pts, pts_geo, delete_dsn = TRUE)

  # extract raster values for points
  pts_env <- raster::extract(env_stack, as_Spatial(pts), df = TRUE) %>%
    tibble() %>%
    # join present and geometry columns to raster value results for points
    left_join(
      pts %>%
        select(ID, present),
      by = "ID") %>%
    relocate(present, .after = ID) %>%
    # extract lon, lat as single columns
    mutate(
      #present = factor(present),
      lon = st_coordinates(geometry)[,1],
      lat = st_coordinates(geometry)[,2]) %>%
    select(-geometry)
  write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)
pts_env %>%
# show first 10 presence, last 10 absence
slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>%
DT::datatable(
rownames = F,
options = list(
dom = "t",
pageLength = 20))
In the end this table is the data that feeds into our species distribution model (y ~ X), where:
y is the present column with values of 1 (present) or 0 (absent)X is all other columns: WC_alt, WC_bio1, WC_bio2, WC_bio4, WC_tmean1, ER_tri, ER_topoWet, lon, latIn the vein of exploratory data analyses, before going into modeling let’s look at the data. Specifically, let’s look at how obviously differentiated is the presence versus absence for each predictor – a more pronounced presence peak should make for a more confident model. A plot for a specific predictor and response is called a “term plot”. In this case we’ll look for predictors where the presence (present = 1) occupies a distinct “niche” from the background absence points (present = 0).
pts_env %>%
select(-ID) %>%
mutate(
present = factor(present)) %>%
pivot_longer(-present) %>%
ggplot() +
geom_density(aes(x = value, fill = present)) +
scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
theme_bw() +
facet_wrap(~name, scales = "free") +
theme(
legend.position = c(1, 0),
legend.justification = c(1, 0))
## Warning: Removed 161 rows containing non-finite values (stat_density).
# load packages for this section (cran_repo set to silence the mirror message)
librarian::shelf(
  DT, dplyr, dismo, GGally, here, readr, tidyr,
  cran_repo = "https://cran.r-project.org")
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = F)
dir_data <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 2992
datatable(pts_env, rownames = F)
Let’s look at a pairs plot (using GGally::ggpairs()) to show correlations between variables.
GGally::ggpairs(
select(pts_env, -ID),
aes(color = factor(present)))
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 21 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 15 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 26 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 26 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 26 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 26 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 26 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning: Removed 21 rows containing missing values (geom_point).
## Warning: Removed 26 rows containing missing values (geom_point).
## Warning: Removed 26 rows containing missing values (geom_point).
## Warning: Removed 26 rows containing missing values (geom_point).
## Warning: Removed 26 rows containing missing values (geom_point).
## Warning: Removed 26 rows containing missing values (geom_point).
## Warning: Removed 21 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 22 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 21 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 21 rows containing missing values
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 22 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 15 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 15 rows containing missing values
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 21 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 21 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
Pairs plot with present color coded.
Let’s setup a data frame with only the data we want to model by:
# setup model data
d <- pts_env %>%
select(-ID) %>% # remove terms we don't want to model
tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 2966
Let’s start as simply as possible with a linear model lm() on multiple predictors X to predict presence y using a simpler workflow.
# fit a linear model
mdl <- lm(present ~ ., data = d)
summary(mdl)
##
## Call:
## lm(formula = present ~ ., data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.38352 -0.24428 0.08412 0.25695 1.06721
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2583063 0.3017944 0.856 0.392122
## WC_alt 0.0001677 0.0000329 5.097 3.66e-07 ***
## WC_bio1 -0.0366971 0.0071202 -5.154 2.72e-07 ***
## WC_bio2 0.0384416 0.0066214 5.806 7.09e-09 ***
## WC_bio4 -0.0105836 0.0013410 -7.892 4.14e-15 ***
## WC_tmean1 0.0194405 0.0080319 2.420 0.015562 *
## ER_tri -0.0004938 0.0002451 -2.015 0.043988 *
## ER_topoWet -0.0322490 0.0086590 -3.724 0.000199 ***
## lon 0.0075200 0.0011095 6.778 1.47e-11 ***
## lat 0.0404698 0.0055921 7.237 5.82e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3791 on 2956 degrees of freedom
## Multiple R-squared: 0.4271, Adjusted R-squared: 0.4254
## F-statistic: 244.9 on 9 and 2956 DF, p-value: < 2.2e-16
y_predict <- predict(mdl, d, type="response")
y_true <- d$present
range(y_predict)
## [1] -0.4601328 1.4267937
range(y_true)
## [1] 0 1
The problem with these predictions is that it ranges outside the possible values of present 1 and absent 0. (Later we’ll deal with converting values within this range to either 1 or 0 by applying a cutoff value; i.e. any values > 0.5 become 1 and below become 0.)
To solve this problem of constraining the response term to being between the two possible values, i.e. the probability \(p\) of being one or the other possible \(y\) values, we’ll apply the logistic transformation on the response term.
\[ logit(p_i) = \log_{e}\left( \frac{p_i}{1-p_i} \right) \] We can then express the logit of the predicted term, i.e. the probability \(p\) of being either \(y\), as a linear combination of all possible predictors \(X\), whereby each coefficient \(b\) gets multiplied by the value of \(x\):
\[ \log_{e}\left( \frac{p_i}{1-p_i} \right) = b_0 + b_1 x_{1,i} + b_2 x_{2,i} + \cdots + b_k x_{k,i} \]
# fit a generalized linear model with a binomial logit link function
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl)
##
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"),
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5320 -0.5241 -0.0529 0.6551 2.7247
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.5216545 2.3369165 -1.935 0.05300 .
## WC_alt 0.0011329 0.0002419 4.683 2.82e-06 ***
## WC_bio1 -0.2975124 0.0490732 -6.063 1.34e-09 ***
## WC_bio2 0.2924895 0.0494265 5.918 3.27e-09 ***
## WC_bio4 -0.0714106 0.0104748 -6.817 9.27e-12 ***
## WC_tmean1 0.1559532 0.0602603 2.588 0.00965 **
## ER_tri -0.0022047 0.0018692 -1.180 0.23819
## ER_topoWet -0.1323539 0.0728399 -1.817 0.06921 .
## lon 0.0458538 0.0077432 5.922 3.18e-09 ***
## lat 0.2938556 0.0407884 7.204 5.83e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4111.6 on 2965 degrees of freedom
## Residual deviance: 2510.5 on 2956 degrees of freedom
## AIC: 2530.5
##
## Number of Fisher Scoring iterations: 5
y_predict <- predict(mdl, d, type="response")
range(y_predict)
## [1] 0.0006461404 0.9988358742
Excellent, our response is now constrained between 0 and 1. Next, let’s look at the term plots to see the relationship between predictor and response.
# show term plots
termplot(mdl, partial.resid = TRUE, se = TRUE, main = F, ylim="free")
With a generalized additive model we can add “wiggle” to the relationship between predictor and response by introducing smooth s() terms.
librarian::shelf(mgcv)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
# fit a generalized additive model with smooth predictors
mdl <- mgcv::gam(
formula = present ~ s(WC_alt) + s(WC_bio1) +
s(WC_bio2) + s(WC_bio4) + s(WC_tmean1) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat),
family = binomial, data = d)
summary(mdl)
##
## Family: binomial
## Link function: logit
##
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(WC_bio4) +
## s(WC_tmean1) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.0511 0.4285 -4.786 1.7e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(WC_alt) 8.416 8.893 36.722 2.62e-05 ***
## s(WC_bio1) 8.969 8.999 115.799 < 2e-16 ***
## s(WC_bio2) 6.935 7.918 32.136 8.38e-05 ***
## s(WC_bio4) 8.997 9.000 142.691 < 2e-16 ***
## s(WC_tmean1) 8.766 8.957 85.578 < 2e-16 ***
## s(ER_tri) 7.462 8.404 24.796 0.00286 **
## s(ER_topoWet) 1.000 1.000 6.522 0.01066 *
## s(lon) 7.752 8.598 131.579 < 2e-16 ***
## s(lat) 8.099 8.761 184.895 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.681 Deviance explained = 63.2%
## UBRE = -0.44418 Scale est. = 1 n = 2966
# show term plots
plot(mdl, scale=0)
- Question: Which GAM environmental variables, and even range of values, seem to contribute most towards presence (above 0 response) versus absence (below 0 response)? - Presence: - “WC_tmean1” (Mean temperature (January)): Possibly for >0 and <-30 degrees but it seems to have a wide variance (dotted lines are not close) - Longitude: Possibly from -150 > -120 but it does not seem strong - “WC_bio4” (Temperature seasonality): Possibly between 70-110 - Absence: - “WC_bio1” (Annual mean temperature): >10 degrees - “WC_bio4” (Temperature seasonality): <70 and >110 - Longitude: Possibly from < -150 but it does not seem strong
Overall it seems like there are more variables that are better predictors for absence rather than presence but that could just be from my inexperience to read these charts.
Maxent is probably the most commonly used species distribution model (Elith 2011) since it performs well with few input data points, only requires presence points (and samples background for comparison) and is easy to use with a Java graphical user interface (GUI).
# load extra packages
librarian::shelf(
maptools, sf)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds")

# show version of MaxEnt (Java) when knitting non-interactively;
# multi-line if bodies are braced per tidyverse style
if (!interactive()) {
  maxent()
}
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)
# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>%
sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class
# fit a maximum entropy model
if (!file.exists(mdl_maxent_rds)){
mdl <- maxent(env_stack, obs_sp)
readr::write_rds(mdl, mdl_maxent_rds)
}
mdl <- read_rds(mdl_maxent_rds)
# plot variable contributions per predictor
plot(mdl)
# plot term plots
response(mdl)
- Question: Which Maxent environmental variables, and even range of values, seem to contribute most towards presence (closer to 1 response) and how might this differ from the GAM results? - “WC_bio1”: Temps < -5 degrees - “WC_tmean1”: Temps > 10 degrees - “ER_tri”: Values > 300 - “ER_topoWet”: Values < 0 The Maxent environmental variables add WC_bio1, ER_tri, and ER_topoWet to presence compared to the GAM results and takes away the WC_bio4.
# predict
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')
plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')
# global knitr chunk options: hide package startup warnings/messages
# in the knitted document
knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE)

# load packages (cran_repo set explicitly to silence shelf()'s mirror message)
librarian::shelf(
  caret,       # m: modeling framework
  dplyr, ggplot2, here, readr,
  pdp,         # X: partial dependence plots
  ranger,      # m: random forest modeling
  rpart,       # m: recursive partition modeling
  rpart.plot,  # m: recursive partition plotting
  rsample,     # d: split train/test data
  skimr,       # d: skim summarize data table
  vip,         # X: variable importance
  cran_repo = "https://cran.r-project.org")
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
# options: disable scientific notation; suppress readr column-spec printing
# (FALSE spelled out, not the reassignable F)
options(
  scipen = 999,
  readr.show_col_types = FALSE)
set.seed(42)

# graphical theme
ggplot2::theme_set(ggplot2::theme_light())

# paths
dir_data    <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

# read data
pts_env <- read_csv(pts_env_csv)
d <- pts_env %>%
  select(-ID) %>%                       # not used as a predictor x
  mutate(
    present = factor(present)) %>%      # categorical response
  na.omit()                             # drop rows with NA
skim(d)
| Name | d |
| Number of rows | 2966 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| present | 0 | 1 | FALSE | 2 | 0: 1495, 1: 1471 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| WC_alt | 0 | 1 | 1276.32 | 827.89 | -43.00 | 550.00 | 1166.00 | 2020.75 | 3944.00 | ▇▇▅▃▁ |
| WC_bio1 | 0 | 1 | 0.71 | 7.05 | -17.40 | -3.20 | 0.00 | 4.40 | 23.30 | ▂▅▇▂▁ |
| WC_bio2 | 0 | 1 | 12.31 | 3.20 | 4.40 | 9.72 | 12.00 | 15.50 | 20.30 | ▂▇▇▇▁ |
| WC_bio4 | 0 | 1 | 95.02 | 28.90 | 22.89 | 76.78 | 85.06 | 113.39 | 161.12 | ▁▆▇▃▃ |
| WC_tmean1 | 0 | 1 | -12.58 | 10.09 | -34.30 | -19.60 | -12.10 | -6.30 | 13.30 | ▃▃▇▃▁ |
| ER_tri | 0 | 1 | 67.77 | 67.07 | 0.00 | 13.63 | 43.64 | 107.73 | 325.33 | ▇▃▂▁▁ |
| ER_topoWet | 0 | 1 | 9.85 | 1.99 | 6.16 | 8.06 | 9.66 | 11.44 | 14.87 | ▆▇▇▆▂ |
| lon | 0 | 1 | -121.20 | 15.18 | -165.46 | -128.90 | -115.88 | -110.49 | -100.36 | ▁▂▃▆▇ |
| lat | 0 | 1 | 51.63 | 9.89 | 26.96 | 44.49 | 51.12 | 60.55 | 70.23 | ▁▃▇▅▅ |
# create training set with 80% of full data
d_split <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train <- rsample::training(d_split)
# show number of rows present is 0 vs 1
table(d$present)
##
## 0 1
## 1495 1471
table(d_train$present)
##
## 0 1
## 1196 1176
# fit a decision stump: a tree constrained to a single split (maxdepth = 1);
# cp = 0 disables complexity-based pruning and minbucket = 5 requires at
# least 5 observations per terminal node. rpart.control() is the documented
# way to build the control object (a bare list works only by coincidence
# of matching names).
mdl <- rpart(
  present ~ ., data = d_train,
  control = rpart.control(
    cp = 0, minbucket = 5, maxdepth = 1))
mdl
## n= 2372
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 2372 1176 0 (0.5042159 0.4957841)
## 2) ER_topoWet>=10.855 747 99 0 (0.8674699 0.1325301) *
## 3) ER_topoWet< 10.855 1625 548 1 (0.3372308 0.6627692) *
# plot tree
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl)
Decision tree (stump) illustrating its single split, which here is on ER_topoWet.
# decision tree with defaults
# (no control list, so rpart's default complexity penalty prunes the tree;
# the internal cross-validated error, xerror, is reported in cptable below)
mdl <- rpart(present ~ ., data = d_train)
mdl
## n= 2372
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 2372 1176 0 (0.50421585 0.49578415)
## 2) ER_topoWet>=10.855 747 99 0 (0.86746988 0.13253012)
## 4) WC_alt< 2298.5 703 57 0 (0.91891892 0.08108108) *
## 5) WC_alt>=2298.5 44 2 1 (0.04545455 0.95454545) *
## 3) ER_topoWet< 10.855 1625 548 1 (0.33723077 0.66276923)
## 6) lat< 43.41538 191 8 0 (0.95811518 0.04188482) *
## 7) lat>=43.41538 1434 365 1 (0.25453278 0.74546722)
## 14) WC_bio4>=116.22 191 63 0 (0.67015707 0.32984293)
## 28) lat< 69.3772 172 44 0 (0.74418605 0.25581395) *
## 29) lat>=69.3772 19 0 1 (0.00000000 1.00000000) *
## 15) WC_bio4< 116.22 1243 237 1 (0.19066774 0.80933226)
## 30) WC_alt< 1806.5 673 205 1 (0.30460624 0.69539376)
## 60) lat< 48.307 46 7 0 (0.84782609 0.15217391) *
## 61) lat>=48.307 627 166 1 (0.26475279 0.73524721)
## 122) lon< -151.3747 52 19 0 (0.63461538 0.36538462) *
## 123) lon>=-151.3747 575 133 1 (0.23130435 0.76869565) *
## 31) WC_alt>=1806.5 570 32 1 (0.05614035 0.94385965) *
# draw the fitted tree
rpart.plot(mdl)
# plot complexity parameter
# (cross-validated error against tree size; used to pick the pruning point)
plotcp(mdl)
# rpart cross validation results
# (CP = complexity parameter, xerror = cross-validated relative error)
mdl$cptable
## CP nsplit rel error xerror xstd
## 1 0.44982993 0 1.0000000 1.0229592 0.02070501
## 2 0.14880952 1 0.5501701 0.5544218 0.01848942
## 3 0.05527211 2 0.4013605 0.4047619 0.01658662
## 4 0.03401361 3 0.3460884 0.3681973 0.01599809
## 5 0.01615646 4 0.3120748 0.3231293 0.01519049
## 6 0.01360544 5 0.2959184 0.2950680 0.01463567
## 7 0.01190476 7 0.2687075 0.2899660 0.01453006
## 8 0.01000000 8 0.2568027 0.2882653 0.01449452
Decision tree for \(present\) classification.
Question: Based on the complexity plot threshold, what size of tree is recommended? I believe it is recommending a tree of size 9.
# Tune the rpart complexity parameter with caret:
# 10-fold cross-validation over a 20-value cp grid.
cv_control <- trainControl(method = "cv", number = 10)
mdl_caret <- train(
  present ~ ., data = d_train,
  method = "rpart",
  trControl = cv_control,
  tuneLength = 20)
# plot cross-validated accuracy for each candidate cp value
ggplot(mdl_caret)
Cross-validated accuracy rate for the 20 different \(\alpha\) parameter values in our grid search. Lower \(\alpha\) values (deeper trees) help to minimize errors.
vip(mdl_caret, num_features = 40, bar = FALSE)
Variable importance based on the total reduction in loss for the \(present\) classification decision tree.
Question: what are the top 3 most important variables of your model? Latitude, WC_alt, and longitude
# Partial dependence of the prediction on the two strongest predictors,
# first one at a time (p_lat, p_alt), then jointly as a 3-D surface.
p_lat <- autoplot(partial(mdl_caret, pred.var = "lat"))
p_alt <- autoplot(partial(mdl_caret, pred.var = "WC_alt"))
p_joint <- partial(mdl_caret, pred.var = c("lat", "WC_alt")) %>%
  plotPartial(
    levelplot = FALSE, zlab = "yhat", drape = TRUE,
    colorkey = TRUE, screen = list(z = -20, x = -60))
# Arrange the three panels in a single row
gridExtra::grid.arrange(p_lat, p_alt, p_joint, ncol = 3)
Partial dependence plots to understand the relationship between lat, WC_alt and present.
# number of features
# (every column except the response `present`)
n_features <- length(setdiff(names(d_train), "present"))
# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)
# get out of the box RMSE
# (prediction.error is computed on out-of-bag samples, so no separate
# validation set is needed)
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.2932633
# re-run model with impurity-based variable importance
mdl_impurity <- ranger(
present ~ ., data = d_train,
importance = "impurity")
# re-run model with permutation-based variable importance
mdl_permutation <- ranger(
present ~ ., data = d_train,
importance = "permutation")
# compare the two importance measures side by side
p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)
gridExtra::grid.arrange(p1, p2, nrow = 1)
Most important variables based on impurity (left) and permutation (right).
Question: How might variable importance differ between rpart and RandomForest in your model outputs? Based on the permutation-based random forest, the most important variables are latitude, WC_alt, and longitude — the same top three as the rpart model. The x-axis scales differ because the two approaches quantify importance differently: rpart sums each variable's total reduction in node impurity across the tree, whereas permutation importance measures how much prediction error increases when that variable's values are shuffled.
Now you’ll complete the modeling workflow with the steps to evaluate model performance and calibrate model parameters.
Full model workflow with calibrate and evaluate steps emphasized.
# load packages
librarian::shelf(
dismo, # species distribution modeling: maxent(), predict(), evaluate(),
dplyr, ggplot2, GGally, here, maptools, readr,
raster, rsample, sf,
usdm) # uncertainty analysis for species distribution models: vifcor()
# mask raster::select with dplyr's verb; use `<-` for assignment,
# consistent with the first setup chunk (was `=`)
select <- dplyr::select
# options
set.seed(42) # reproducible train/test split
options(
scipen = 999, # avoid scientific notation in printed output
readr.show_col_types = FALSE) # spell out FALSE rather than reassignable F
ggplot2::theme_set(ggplot2::theme_light())
# paths
dir_data <- here("data/sdm")
pts_geo <- file.path(dir_data, "pts.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
mdl_maxv_rds <- file.path(dir_data, "mdl_maxent_vif.rds")
# read points of observation: presence (1) and absence (0)
pts <- read_sf(pts_geo)
# read raster stack of environment
env_stack <- raster::stack(env_stack_grd)
# create training set with 80% of full data, stratified so the
# presence/absence proportions are preserved in both splits
pts_split <- rsample::initial_split(
pts, prop = 0.8, strata = "present")
pts_train <- rsample::training(pts_split)
pts_test <- rsample::testing(pts_split)
# dismo expects sp (Spatial*) objects, so convert from sf
pts_train_p <- pts_train %>%
filter(present == 1) %>%
as_Spatial()
pts_train_a <- pts_train %>%
filter(present == 0) %>%
as_Spatial()
# show pairs plot before multicollinearity reduction with vifcor()
pairs(env_stack)
# calculate variance inflation factor per predictor, a metric of multicollinearity between variables
# (VIF > ~10 is commonly taken to indicate problematic collinearity)
vif(env_stack)
## Variables VIF
## 1 WC_alt 4.647849
## 2 WC_bio1 40.294073
## 3 WC_bio2 7.296272
## 4 WC_bio4 30.552457
## 5 WC_tmean1 90.022665
## 6 ER_tri 4.810737
## 7 ER_topoWet 4.624531
# stepwise reduce predictors, based on a max correlation of 0.7 (max 1)
v <- vifcor(env_stack, th=0.7)
v
## 3 variables from the 7 input variables have collinearity problem:
##
## WC_tmean1 ER_tri WC_bio1
##
## After excluding the collinear variables, the linear correlation coefficients ranges between:
## min correlation ( ER_topoWet ~ WC_bio2 ): 0.076187
## max correlation ( WC_bio2 ~ WC_alt ): 0.6055451
##
## ---------- VIFs of the remained variables --------
## Variables VIF
## 1 WC_alt 2.717068
## 2 WC_bio2 2.350683
## 3 WC_bio4 1.534665
## 4 ER_topoWet 1.968989
# reduce environmental raster stack by dropping the collinear layers
# identified above (WC_tmean1, ER_tri, WC_bio1)
env_stack_v <- usdm::exclude(env_stack, v)
# show pairs plot after multicollinearity reduction with vifcor()
pairs(env_stack_v)
# fit a maximum entropy model on the VIF-reduced predictor stack;
# cache the fitted model to disk so re-knits skip the (slow) refit
# NOTE(review): pts_train includes both presence (1) and absence (0) rows;
# dismo::maxent() treats all points supplied as `p` as presences unless
# absences are passed separately via `a=` — confirm this is intended
if (!file.exists(mdl_maxv_rds)){
mdl_maxv <- maxent(env_stack_v, sf::as_Spatial(pts_train))
readr::write_rds(mdl_maxv, mdl_maxv_rds)
}
mdl_maxv <- read_rds(mdl_maxv_rds)
# plot variable contributions per predictor
plot(mdl_maxv)
Question: Which variables were removed due to multicollinearity and what is the rank of most to least important remaining variables in your model?
WC_bio1, WC_tmean1, and ER_tri were removed for multicollinearity. Ranking of the remaining variables, from most to least important: ER_topoWet, WC_bio4, WC_alt, WC_bio2.
# plot term plots
# (response curve of predicted suitability against each predictor)
response(mdl_maxv)
# predict suitability across the full (unreduced) environment stack
y_maxv <- predict(env_stack, mdl_maxv) #, ext=ext, progress='')
plot(y_maxv, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')
# split the held-out test points into presence / absence,
# converted to sp objects for dismo
pts_test_p <- pts_test %>%
filter(present == 1) %>%
as_Spatial()
pts_test_a <- pts_test %>%
filter(present == 0) %>%
as_Spatial()
# NOTE(review): this appears to recompute the same prediction raster as
# above with the argument order swapped — confirm, then drop one of the
# two calls
y_maxv <- predict(mdl_maxv, env_stack)
#plot(y_maxv)
# evaluate on the test set: AUC, correlation, and optimal threshold
e <- dismo::evaluate(
p = pts_test_p,
a = pts_test_a,
model = mdl_maxv,
x = env_stack)
e
## class : ModelEvaluation
## n presences : 295
## n absences : 300
## AUC : 0.8834802
## cor : 0.668465
## max TPR+TNR at : 0.6589146
plot(e, 'ROC')
# threshold maximizing sensitivity + specificity (TPR + TNR)
thr <- threshold(e)[['spec_sens']]
thr
## [1] 0.6589146
# classify test points against the chosen threshold; na.omit drops points
# without a raster value (e.g. falling outside the predicted extent)
p_true <- na.omit(raster::extract(y_maxv, pts_test_p) >= thr)
a_true <- na.omit(raster::extract(y_maxv, pts_test_a) < thr)
# (t)rue/(f)alse (p)ositive/(n)egative rates
tpr <- sum(p_true)/length(p_true)
fnr <- sum(!p_true)/length(p_true)
fpr <- sum(!a_true)/length(a_true)
tnr <- sum(a_true)/length(a_true)
# confusion matrix of rates (rows = observed, columns = predicted).
# matrix() fills column-wise, so the first pair is the present_pred
# column (tpr, fpr) and the second the absent_pred column (fnr, tnr);
# the original order c(tpr, fnr, fpr, tnr) put fnr and fpr in each
# other's cells, mislabeling the off-diagonal rates.
matrix(
c(tpr, fpr,
fnr, tnr),
nrow=2, dimnames = list(
c("present_obs", "absent_obs"),
c("present_pred", "absent_pred")))
## present_pred absent_pred
## present_obs 0.8915254 0.1084746
## absent_obs 0.2166667 0.7833333
# add the model's operating point (fpr, tpr) to the ROC plot
points(fpr, tpr, pch=23, bg="blue")
# map predicted presence: cells with suitability above the threshold
plot(y_maxv > thr)